#!/bin/env python
# -*- coding: utf-8 -*-
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright (C) 2023, GEM Foundation
#
# OpenQuake is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OpenQuake is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with OpenQuake.  If not, see <http://www.gnu.org/licenses/>.

import csv
import os
import logging
import pandas
import numpy
from openquake.baselib import sap, general, performance
from openquake.hazardlib import nrml, gsim_lt, site
from openquake.risklib.riskmodels import CompositeRiskModel, RiskFuncList
from openquake.risklib.asset import _get_exposure
from openquake.commonlib.datastore import create_job_dstore
from openquake.commonlib.oqvalidation import OqParam
from openquake.commonlib import expo_to_hdf5
from openquake.engine.global_ses import dt

U16 = numpy.uint16
F32 = numpy.float32


def collect_exposures(grm_dir):
    """
    Collect the files of kind Exposure_<Country>.xml.

    Every entry of `grm_dir` is treated as a region and only the files
    found directly inside <grm_dir>/<region>/Exposure/Exposure are
    considered; regions without such a directory are skipped.

    :param grm_dir: directory containing the global risk model
    :returns: list of full paths to the XML files, ordered by region
        name and then by file name
    """
    out = []
    # os.listdir returns the entries in arbitrary order: sorting makes the
    # result reproducible across machines and filesystems
    for region in sorted(os.listdir(grm_dir)):
        expodir = os.path.join(grm_dir, region, 'Exposure', 'Exposure')
        # isdir and not exists: a plain file with that name cannot be listed
        if not os.path.isdir(expodir):
            continue
        for fname in sorted(os.listdir(expodir)):
            # i.e. Exposure_Chile.xml; other files sharing the prefix
            # (CSV, backups, ...) are not exposure XML files
            if fname.startswith('Exposure_') and fname.endswith('.xml'):
                out.append(os.path.join(expodir, fname))
    return out


def read_world_vulnerability(grm_dir, dstore):
    """
    Read the world vulnerability functions, build a CompositeRiskModel
    out of them and store it in the datastore as the dataframe 'crm'.

    :param grm_dir: directory containing the global risk model
    :param dstore: datastore where the risk model is saved
    :returns: the number of vulnerability functions read
    """
    rfuncs = RiskFuncList()
    for kind in ('structural', 'nonstructural', 'contents', 'area', 'number',
                 'fatalities', 'residents', 'affectedpop', 'injured'):
        # the functions in the fatalities file get the loss type occupants
        if kind == 'fatalities':
            loss_type = 'occupants'
        else:
            loss_type = kind
        relpath = f'Vulnerability/vulnerability/vulnerability_{kind}.xml'
        vfdict = nrml.to_python(os.path.join(grm_dir, relpath))
        for rfunc in vfdict.values():
            rfunc.loss_type = loss_type
            rfunc.kind = 'vulnerability'
            rfuncs.append(rfunc)
    crm = CompositeRiskModel(OqParam(calculation_mode='custom'), rfuncs)
    dstore.create_df('crm', crm.to_dframe(), 'gzip', **crm.get_attrs())
    return len(rfuncs)


def get_gsim_lt(cwd):
    """
    Read the logic tree file in the given directory: gmmLTrisk.xml is
    used if it exists, otherwise gmmLT.xml.

    :param cwd: directory containing the logic tree file
    :returns: a GsimLogicTree instance
    """
    path = os.path.join(cwd, 'gmmLTrisk.xml')
    if not os.path.exists(path):
        # no gmmLTrisk.xml: fall back to gmmLT.xml without checking it
        path = os.path.join(cwd, 'gmmLT.xml')
    return gsim_lt.GsimLogicTree(path)


def discard_dupl(records):
    """
    Discard the records referring to duplicate sites.

    Two records are duplicates when they have the same (lon, lat) pair;
    only the first occurrence is kept and the input order is preserved.

    :param records: a structured array (or a sequence of records) with
        at least the fields `lon` and `lat`
    :returns: an array with one record per distinct (lon, lat); it is
        empty when `records` is empty
    """
    acc = {}
    for rec in records:
        # setdefault keeps the first record seen for each location
        acc.setdefault((rec['lon'], rec['lat']), rec)
    unique = list(acc.values())
    # take the dtype from the input when it has one, so that an empty
    # array gives back an empty array instead of failing because no
    # record was ever seen
    dtype = getattr(records, 'dtype', None)
    if dtype is None and unique:
        dtype = unique[-1].dtype
    return numpy.array(unique, dtype=dtype)


def build_site_model(grm_dir):
    """
    Build the global site model by merging all the files named
    Site_model_*.csv found under the global_risk_model directory.

    If the environment variable OQ_SAMPLE_ASSETS is set to a value less
    than 1, each file is passed through general.random_filter with ten
    times that value. Longitudes and latitudes are rounded to 5 digits;
    the columns missing in a file are left at zero; duplicate sites are
    removed with discard_dupl.

    :param grm_dir: directory containing the global risk model
    :returns: a structured array with one record per distinct site
    """
    sampling = float(os.environ.get('OQ_SAMPLE_ASSETS', 1))
    frames = []
    for folder, _subdirs, basenames in os.walk(grm_dir):
        for basename in basenames:
            is_site_model = (basename.startswith('Site_model_') and
                             basename.endswith('.csv'))
            if not is_site_model:
                continue
            print('Reading %s' % basename)
            frame = pandas.read_csv(os.path.join(folder, basename))
            if sampling < 1:
                frame = general.random_filter(frame, sampling*10)
            if len(frame):
                frames.append(frame)

    # the output has the union of the columns, in alphabetical order
    colnames = set()
    for frame in frames:
        colnames.update(frame.columns)
    total = sum(len(frame) for frame in frames)
    fields = [(col, site.site_param_dt[col]) for col in sorted(colnames)]
    sm = numpy.zeros(total, fields)

    # fill the array one file at a time
    start = 0
    for frame in frames:
        stop = start + len(frame)
        chunk = sm[start:stop]  # a view, so assigning to it fills sm
        for col in frame.columns:
            values = frame[col].to_numpy()
            if col in ('lon', 'lat'):
                values = numpy.round(values, 5)
            chunk[col] = values
        start = stop
    return discard_dupl(sm)


def build_site_model_gsims(mosaic_dir, grm_dir, dstore):
    """
    Store the global site model and the table of the GSIMs.

    The mosaic directory is scanned looking for the directories
    containing a file job_vs30.ini; the model name is the name of the
    parent of each such directory. For every tectonic region type and
    GSIM of the logic tree found there, a row
    (model, trt, gsim, default weight) is produced.

    :param mosaic_dir: directory containing the hazard mosaic
    :param grm_dir: directory containing the global risk model
    :param dstore: datastore where 'site_model' and
        'model_trt_gsim_weight' are saved
    :returns: the number of sites in the stored site model
    """
    rows = []
    for cwd, dirs, files in os.walk(mosaic_dir):
        if 'job_vs30.ini' not in files:
            continue
        # second-to-last component of the path; os.path honors the path
        # separator of the platform, unlike splitting on '/'
        model = os.path.basename(os.path.dirname(cwd))
        # not named gsim_lt, to avoid shadowing the imported module
        lt = get_gsim_lt(cwd)
        for trt, gsims in lt.values.items():
            for gsim in gsims:
                rows.append(
                    (model, trt, str(gsim), gsim.weight['default']))
    smodel = build_site_model(grm_dir)
    dstore['site_model'] = smodel
    dstore['model_trt_gsim_weight'] = numpy.array(rows, dt)
    return len(smodel)


def find_long_strings_and_export(csv_paths, fieldname, output_csv_path,
                                 min_length=32):
    """
    Scan the given CSV files looking for values of the column `fieldname`
    longer than `min_length` characters and export them to a CSV report
    with columns filename, linenumber, <fieldname>, <fieldname>_len.

    Files without that column are skipped; files that cannot be read are
    reported on stdout and skipped. The report is written only if at
    least one long value is found.

    :param csv_paths: paths of the CSV files to scan
    :param fieldname: name of the column to check
    :param output_csv_path: path of the CSV report
    :param min_length: values with more characters than this are reported
    """
    len_field = f"{fieldname}_len"
    results = []
    for file_path in csv_paths:
        try:
            # utf-8-sig strips the optional BOM, which would otherwise be
            # glued to the first header and hide the column; files without
            # a BOM are decoded exactly as with utf-8
            with open(file_path, newline='', encoding='utf-8-sig') as f:
                reader = csv.reader(f)
                headers = next(reader, None)
                if not headers or fieldname not in headers:
                    continue  # skip if the field is missing
                field_index = headers.index(fieldname)
                # the header is row 1, so the data rows start from 2
                for i, row in enumerate(reader, start=2):
                    if len(row) <= field_index:
                        continue  # short row, the field is absent
                    value = row[field_index]
                    if len(value) > min_length:
                        results.append({
                            "filename": file_path,
                            "linenumber": i,
                            fieldname: value,
                            len_field: len(value)
                        })
        except Exception as e:
            # deliberate best effort: a broken file must not stop the scan
            print(f"Error reading {file_path}: {e}")
    if not results:
        print("No strings longer than {} characters found.".format(min_length))
        return
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as out:
        fieldnames = ["filename", "linenumber", fieldname, len_field]
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    print(f"Results saved to {output_csv_path}")


def _collect_csv_files(exposures_xml, verbose=False):
    """
    :param exposures_xml: paths of the exposure XML files
    :param verbose: if true, print the name of each XML file being read
    :returns: the list of the data files referenced by the exposures
    """
    csv_files = []
    for xml in exposures_xml:
        if verbose:
            print(f'Reading {xml}')
        exposure, _ = _get_exposure(xml, stop='asset')
        csv_files.extend(exposure.datafiles)
    return csv_files


def main(mosaic_dir, grm_dir, wfp=False, action='build'):
    """
    If action='build' (the default) stores the global exposure and tmap.
    If action='zip' zips the global exposure in a single file exposures.zip
    If action='find_long_XXX' find the fields XXX longer than 32 chars
    and save them in long_strings.csv.
    Any other action is rejected with a ValueError.
    """
    prefix = 'find_long_'
    if action.startswith(prefix):  # i.e. find_long_NAME_2
        fieldname = action[len(prefix):]
        if not fieldname:
            raise ValueError(
                'Missing field name in action %r: expected find_long_<FIELD>'
                % action)
        csv_files = _collect_csv_files(collect_exposures(grm_dir))
        find_long_strings_and_export(csv_files, fieldname, "long_strings.csv")
        return
    if action == 'zip':
        tmaps = sorted(expo_to_hdf5.read_world_tmap(grm_dir))
        exposures_xml = collect_exposures(grm_dir)
        csv_files = _collect_csv_files(exposures_xml, verbose=True)
        n = len(tmaps) + len(exposures_xml) + len(csv_files)
        print(f'Zipping {n} files')
        a = general.zipfiles(tmaps + exposures_xml + csv_files, 'exposures.zip')
        print(f'Saved {a} [{general.humansize(os.path.getsize(a))}]')
        return
    if action != 'build':
        # a mistyped action must not silently start the expensive build
        raise ValueError(
            'Unknown action %r: expected build, zip or find_long_<FIELD>'
            % action)

    mon = performance.Monitor(measuremem=True)
    description = 'Storing global exposure to hdf5'
    if wfp:
        description += ' (only for WFP countries)'
    job, dstore = create_job_dstore(description=description)
    with dstore, job:
        with mon:
            n = build_site_model_gsims(mosaic_dir, grm_dir, dstore)
            logging.info('Stored {:_d} sites'.format(n))
            n = read_world_vulnerability(grm_dir, dstore)
            logging.info('Read %d vulnerability functions', n)
            fnames = collect_exposures(grm_dir)
            expo_to_hdf5.store(fnames, grm_dir, wfp, dstore)
        logging.info(mon)


# Descriptions of the parameters of `main`, attached as function attributes
# named after each parameter (presumably read by sap.run to build the
# command-line help -- confirm against openquake.baselib.sap)
main.mosaic_dir = 'Directory containing the hazard mosaic'
main.grm_dir = 'Directory containing the global risk model'
main.wfp = 'If true, consider only 7 countries for the World Food Program'
main.action = 'Perform an action (build, zip, find_long_<FIELD>)'

# when executed as a script, hand `main` over to sap
if __name__ == '__main__':
    sap.run(main)
